{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GPT-J 6B"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Load model and tokenizer from HuggingFace Hub\n",
    "\n",
    "GPT-J is loaded in fp32 mode by default which takes about 24GB CPU memory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"EleutherAI/gpt-j-6B\")\n",
    "\n",
    "model = AutoModelForCausalLM.from_pretrained(\"EleutherAI/gpt-j-6B\")"
   ]
  },
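  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If ~24 GB of CPU memory is not available, one option (not used in the rest of this notebook) is to load the checkpoint in fp16, which roughly halves the footprint. The cell below is a sketch; it assumes the `float16` revision that EleutherAI publishes for this checkpoint on the Hub."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: load the half-precision checkpoint instead (~12 GB of CPU memory).\n",
    "# Assumes the \"float16\" revision exists on the Hub; skip this cell if you ran the fp32 load above.\n",
    "import torch\n",
    "\n",
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    \"EleutherAI/gpt-j-6B\",\n",
    "    revision=\"float16\",\n",
    "    torch_dtype=torch.float16,\n",
    "    low_cpu_mem_usage=True,\n",
    ")"
   ]
  },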
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Use BMInf wrapper for low-resource inference"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import bminf\n",
    "with torch.cuda.device(0):\n",
    "    model = bminf.wrapper(model, quantization=False, memory_limit=8 << 30)  # 8GB"
   ]
  },
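  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "BMInf also exposes the `quantization` flag seen above. The cell below is a sketch of a lower-memory variant; it assumes that enabling the flag stores parameters in a quantized (int8) format, trading a little accuracy for a smaller GPU footprint and therefore allowing a tighter `memory_limit`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional alternative to the cell above (run one or the other, not both).\n",
    "# Assumption: quantization=True keeps parameters in int8, so a 4 GiB budget may suffice.\n",
    "with torch.cuda.device(0):\n",
    "    model = bminf.wrapper(model, quantization=True, memory_limit=4 << 30)  # 4 GiB"
   ]
  },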
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. See the GPU usage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "|===========================================================================|\n",
      "|                  PyTorch CUDA memory summary, device ID 0                 |\n",
      "|---------------------------------------------------------------------------|\n",
      "|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |\n",
      "|===========================================================================|\n",
      "|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |\n",
      "|---------------------------------------------------------------------------|\n",
      "| Allocated memory      |    9297 MB |    9297 MB |    9297 MB |       0 B  |\n",
      "|       from large pool |    9296 MB |    9296 MB |    9296 MB |       0 B  |\n",
      "|       from small pool |       1 MB |       1 MB |       1 MB |       0 B  |\n",
      "|---------------------------------------------------------------------------|\n",
      "| Active memory         |    9297 MB |    9297 MB |    9297 MB |       0 B  |\n",
      "|       from large pool |    9296 MB |    9296 MB |    9296 MB |       0 B  |\n",
      "|       from small pool |       1 MB |       1 MB |       1 MB |       0 B  |\n",
      "|---------------------------------------------------------------------------|\n",
      "| GPU reserved memory   |    9298 MB |    9298 MB |    9298 MB |       0 B  |\n",
      "|       from large pool |    9296 MB |    9296 MB |    9296 MB |       0 B  |\n",
      "|       from small pool |       2 MB |       2 MB |       2 MB |       0 B  |\n",
      "|---------------------------------------------------------------------------|\n",
      "| Non-releasable memory |  710656 B  |   18400 KB |   34800 KB |   34106 KB |\n",
      "|       from large pool |       0 B  |   16384 KB |   32768 KB |   32768 KB |\n",
      "|       from small pool |  710656 B  |    2032 KB |    2032 KB |    1338 KB |\n",
      "|---------------------------------------------------------------------------|\n",
      "| Allocations           |     125    |     125    |     125    |       0    |\n",
      "|       from large pool |      72    |      72    |      72    |       0    |\n",
      "|       from small pool |      53    |      53    |      53    |       0    |\n",
      "|---------------------------------------------------------------------------|\n",
      "| Active allocs         |     125    |     125    |     125    |       0    |\n",
      "|       from large pool |      72    |      72    |      72    |       0    |\n",
      "|       from small pool |      53    |      53    |      53    |       0    |\n",
      "|---------------------------------------------------------------------------|\n",
      "| GPU reserved segments |      65    |      65    |      65    |       0    |\n",
      "|       from large pool |      64    |      64    |      64    |       0    |\n",
      "|       from small pool |       1    |       1    |       1    |       0    |\n",
      "|---------------------------------------------------------------------------|\n",
      "| Non-releasable allocs |       1    |       2    |       3    |       2    |\n",
      "|       from large pool |       0    |       1    |       2    |       2    |\n",
      "|       from small pool |       1    |       1    |       1    |       0    |\n",
      "|---------------------------------------------------------------------------|\n",
      "| Oversize allocations  |       0    |       0    |       0    |       0    |\n",
      "|---------------------------------------------------------------------------|\n",
      "| Oversize GPU segments |       0    |       0    |       0    |       0    |\n",
      "|===========================================================================|\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(torch.cuda.memory_summary())"
   ]
  },
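  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For a quicker check than the full summary, PyTorch also exposes single-number counters (values in bytes):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Current and peak allocated memory on device 0, converted to GiB.\n",
    "current = torch.cuda.memory_allocated(0) / 2**30\n",
    "peak = torch.cuda.max_memory_allocated(0) / 2**30\n",
    "print(f\"allocated: {current:.2f} GiB (peak: {peak:.2f} GiB)\")"
   ]
  },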
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Run generation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
     ]
    }
   ],
   "source": [
    "prompt = \"To be or not to be, that\"\n",
    "input_ids = tokenizer(prompt, return_tensors=\"pt\").input_ids\n",
    "gen_tokens = model.generate(\n",
    "    input_ids.cuda(),\n",
    "    do_sample=True,\n",
    "    temperature=0.9,\n",
    "    max_length=20\n",
    ")"
   ]
  },
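  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The warning above appears because `generate` received only `input_ids`. It can be silenced by passing the attention mask and a pad token id explicitly, as the message suggests:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same sampling call, but with an explicit attention mask and pad token id\n",
    "# so generate does not have to guess them.\n",
    "inputs = tokenizer(prompt, return_tensors=\"pt\")\n",
    "gen_tokens = model.generate(\n",
    "    inputs.input_ids.cuda(),\n",
    "    attention_mask=inputs.attention_mask.cuda(),\n",
    "    pad_token_id=tokenizer.eos_token_id,\n",
    "    do_sample=True,\n",
    "    temperature=0.9,\n",
    "    max_length=20\n",
    ")"
   ]
  },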
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Get the generated text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['To be or not to be, that is the question — that has been the question, and still']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.batch_decode(gen_tokens)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "29d71688ffbe7d005e79abd80e578fa5cab2d2c2e11d1955de002b95fcc7229b"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
